{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import h2o4gpu\n",
    "import h2o4gpu.util.import_data as io\n",
    "import h2o4gpu.util.metrics as metrics\n",
    "from tabulate import tabulate\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading Data with Pandas\n",
      "(23999, 25)\n",
      "Original m=23999 n=24\n",
      "Size of Train rows=19200 & valid rows=4799\n",
      "Size of Train cols=24 valid cols=24\n",
      "Size of Train cols=25 & valid cols=25 after adding intercept column\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "Import Data for H2O GPU Edition\n",
    "\n",
    "This function will read in data and prepare it for H2O4GPU's GLM solver\n",
    "\n",
    "Parameters\n",
    "----------\n",
    "data_path : str\n",
    "             A path to a dataset (The dataset needs to be all numeric)\n",
    "use_pandas : bool\n",
    "              Indicate if Pandas should be used to parse\n",
    "intercept : bool\n",
    "              Indicate if intercept term is needed\n",
    "valid_fraction : float\n",
    "                  Percentage of dataset reserved for a validation set\n",
    "classification : bool\n",
    "                  Classification problem?\n",
    "Returns\n",
    "-------\n",
    "If valid_fraction > 0 it will return the following:\n",
    "    train_x: numpy array of train input variables\n",
    "    train_y: numpy array of y variable\n",
    "    valid_x: numpy array of valid input variables\n",
    "    valid_y: numpy array of valid y variable\n",
    "    family : string that would either be \"logistic\" if classification is set to True, otherwise \"elasticnet\"\n",
    "If valid_fraction == 0 it will return the following:\n",
    "    train_x: numpy array of train input variables\n",
    "    train_y: numpy array of y variable\n",
    "    family : string that would either be \"logistic\" if classification is set to True, otherwise \"elasticnet\"\n",
    "\"\"\"\n",
    "import os\n",
    "file = os.path.join(os.getcwd(), \"creditcard.csv\")\n",
    "if not os.path.exists(file):\n",
    "   !wget https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/creditcard.csv\n",
    "\n",
    "train_x,train_y,valid_x,valid_y,family=io.import_data(data_path=\"creditcard.csv\", \n",
    "                                                        use_pandas=True, \n",
    "                                                        intercept=True,\n",
    "                                                        valid_fraction=0.2,\n",
    "                                                        classification=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Setting up solver\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "Set up instance of H2O4GPU's GLM solver with default parameters\n",
    "\n",
    "Need to pass in `family` to indicate problem type to solve\n",
    "\"\"\"\n",
    "print(\"Setting up solver\")\n",
    "model = h2o4gpu.ElasticNetH2O(family=family)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Solving\n",
      "CPU times: user 17.5 s, sys: 2.84 s, total: 20.3 s\n",
      "Wall time: 11 s\n",
      "Done Solving\n",
      "Logloss per alpha value (-1.00 = missing)\n",
      "\n",
      "|   Alphas |   Train |   CV |   Valid |\n",
      "|---------:|--------:|-----:|--------:|\n",
      "|     0.00 |    0.48 | 0.48 |   -1.00 |\n",
      "|     0.25 |    0.48 | 0.48 |   -1.00 |\n",
      "|     0.50 |    0.48 | 0.48 |   -1.00 |\n",
      "|     0.75 |    0.48 | 0.48 |   -1.00 |\n",
      "|     1.00 |    0.48 | 0.48 |   -1.00 |\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "Fit GLM Solver\n",
    "\"\"\"\n",
    "print(\"Solving\")\n",
    "%time model.fit(train_x, train_y)\n",
    "print(\"Done Solving\")\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Predictions per alpha\n",
      "[[ 0.76320577  0.14472587  0.14279237 ...,  0.27377224  0.2479739\n",
      "   0.17788069]\n",
      " [ 0.76512319  0.1458478   0.14631642 ...,  0.2623947   0.24248317\n",
      "   0.17817149]\n",
      " [ 0.76369005  0.14476402  0.149213   ...,  0.25479093  0.24055323\n",
      "   0.17880464]\n",
      " [ 0.76870668  0.14425999  0.15220751 ...,  0.255465    0.2437907\n",
      "   0.18548389]\n",
      " [ 0.7688567   0.14338347  0.14736323 ...,  0.25683942  0.24017039\n",
      "   0.17752922]]\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "Make predictions on validation set\n",
    "\"\"\"\n",
    "print(\"Predictions per alpha\")\n",
    "preds = model.predict_proba(valid_x, valid_y)\n",
    "print(preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logloss for alpha =  [ 0.]\n",
      "0.439005\n",
      "Logloss for alpha =  [ 0.25]\n",
      "0.439057\n",
      "Logloss for alpha =  [ 0.5]\n",
      "0.439167\n",
      "Logloss for alpha =  [ 0.75]\n",
      "0.439402\n",
      "Logloss for alpha =  [ 1.]\n",
      "0.439157\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "Get logloss or rmse for validation set per alpha\n",
    "\"\"\"\n",
    "for i in range(model.n_alphas):\n",
    "    if family == \"logistic\":\n",
    "        print(\"Logloss for alpha = \",model.alphas_best[i])\n",
    "        print(metrics.log_loss(valid_y, preds[i]))\n",
    "    else:\n",
    "        print(\"RMSE for alpha = \",model.alphas_best[i])\n",
    "        print(metrics.rmse(valid_y,preds[i]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
